In [1]:
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows() # Disable 5_000 rows limit
pd.set_option('display.max_rows', 500) # Display 500 rows max
pd.set_option('display.max_columns', 500) # Display 500 columns max
pd.set_option('display.max_colwidth', 1000) # Expand maximum column text display
In [2]:
findings_data = pd.read_csv("https://raw.githubusercontent.com/code-423n4/code423n4.com/main/_data/findings/findings.csv") # Set path accordingly
contests_data = pd.read_csv("https://raw.githubusercontent.com/code-423n4/code423n4.com/main/_data/contests/contests.csv")
submissions_data = pd.read_csv("https://raw.githubusercontent.com/Krow10/code4rena-scraper/master/github_code4rena.csv")
In [3]:
df = pd.merge(submissions_data, submissions_data["tags"].str.get_dummies(';'), how='outer', left_index=True, right_index=True)
df = df[df.contest >= 11] # Remove incorrect early contests ids
df["issueCreation"] = pd.to_datetime(df["issueCreation"])
In [4]:
contests_data.rename(columns={'contestid': 'contest'}, inplace=True)
In [5]:
df2 = findings_data.copy()
df2 = pd.merge(df2, df2.groupby(["contest", "pie"])['split'].count().reset_index().groupby("contest")["split"].sum(), on="contest")
df2 = df2.rename(columns={'split_x': 'split', 'split_y': 'total'})
df2 = pd.merge(df2, (df2.groupby(["contest", "pie"])["split"].count()-1).reset_index().groupby("contest")["split"].sum(), on="contest")
df2 = df2.rename(columns={'split_x': 'split', 'split_y': 'duplicates'})
df2 = pd.merge(df2, df2.groupby("contest")["awardUSD"].sum(), on="contest")
df2 = df2.rename(columns={'awardUSD_x': 'awardUSD', 'awardUSD_y': 'totalAwardUSD'})
df2 = pd.merge(df2, ((df2.groupby(["contest", "pie"])["split"].count()-1) * df2.groupby(["contest", "pie"])["awardUSD"].min()).reset_index().rename(columns={0: 'duplicates_award'}).groupby("contest")["duplicates_award"].sum(), on="contest")
df2 = pd.merge(df2, contests_data[["contest", "end_time"]], on="contest")
df2['end_time'] = pd.to_datetime(df2.end_time).dt.to_period("M").astype({"end_time": str})
/tmp/ipykernel_1861/2846388186.py:10: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.
  df2['end_time'] = pd.to_datetime(df2.end_time).dt.to_period("M").astype({"end_time": str})
In [6]:
unselected_color = '#012749'
mean_color = '#fa4d56'
selected_color = '#82cfff'
In [7]:
select_date = alt.selection_interval(encodings=['x'], empty='all')
In [8]:
base = alt.Chart(df2, width=800, height=200).transform_calculate(
    duplicate_ratio='datum.duplicates/datum.total'
)
In [9]:
bars = base.mark_bar(
    size=10
).encode(
    x=alt.X('end_time:T', title=""),
    y=alt.Y('duplicate_ratio:Q', axis=alt.Axis(format='%'), title=""),
    color=alt.condition(
        select_date,
        alt.value(selected_color), alt.value(unselected_color)
    )
).add_selection(
    select_date
)
In [10]:
mean_dup_rule = base.mark_rule(color='red').encode(
    y='mean(duplicate_ratio):Q',
    color=alt.value(mean_color)
).transform_filter(
    select_date
)
In [11]:
base = alt.Chart(df2, width=800, height=200).transform_calculate(
    duplicate_money_ratio='datum.duplicates_award/datum.totalAwardUSD'
)
In [12]:
line = base.mark_line(
    point=True
).encode(
    x=alt.X('end_time:T', title=""),
    y=alt.Y('mean(duplicate_money_ratio):Q', axis=alt.Axis(format='%'), title=""),
    color=alt.condition(
        select_date,
        alt.value(selected_color), alt.value(unselected_color)
    )
).add_selection(
    select_date
)
In [13]:
mean_money_rule = base.mark_rule(color='red').encode(
    y='mean(duplicate_money_ratio):Q',
    color=alt.value(mean_color)
).transform_filter(
    select_date
)
In [14]:
date_slider = alt.Chart(df2[["end_time", "contest"]].drop_duplicates()).mark_bar(
    size=35
).encode(
    x=alt.X(
        'end_time:T', 
        title="Drag the mouse to select a period. Click outside of the selection to reset.", 
        axis=alt.Axis(grid=False)
    ),
    y=alt.Y('count(contest):Q', title="", axis=None),
    color=alt.condition(
        select_date,
        alt.value(selected_color), alt.value(unselected_color)
    )
).properties(
    width=800,
    height=50
).add_selection(
    select_date
)
In [15]:
((bars + mean_dup_rule).properties(title="Ratio of duplicate findings to total") & \
(line + mean_money_rule).properties(title="Percentage of the prize money captured by duplicate findings") & \
date_slider).configure_view(
    strokeWidth=0
)
/opt/hostedtoolcache/Python/3.11.1/x64/lib/python3.11/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
  for col_name, dtype in df.dtypes.iteritems():
Out[15]:
In [ ]: